Take Home Exercise 2

Peer study

Huang Anni (Singapore Management University)
05-02-2022

The task

In this take-home exercise 2, we are required to:

You can check out the full code on GitHub.

Introduction

This exercise requires us to apply the skills you had learned in Lesson 1 and Hands-on Exercise 1 to reveal the demographic of the city of Engagement, Ohio USA by using appropriate static statistical graphics methods. The data should be processed by using appropriate tidyverse family of packages and the statistical graphics must be prepared using ggplot2 and its extensions.

Fig 1 Distribution of age groups

p1 <- ggplot(data=data) + aes(x=age) +
  geom_histogram(bins=20, boundary=60, color="black", fill="grey") +
  ggtitle("Distribution of Participants' Age")
p1

1.1 Critics

1.1.1 Athetic:

This graph is clear and it uses the right graph type(bar) to present the content(distribution of age group). We can make it more attractive by making it colored and remove the unnecessary grey background color and ticks.

After those steps, the graph becomes:

p2 <- ggplot(data = data, 
       aes(x = age)) +
  ggtitle("Distribution of Participants' Age")+
  geom_bar(bins=20, boundary = 60, fill="light blue") +
  theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
        panel.background= element_blank(), axis.line= element_line(color= 'grey'))
p2

1.1.2 Clarity:

We can see the distribution of 20 age groups in the orgional graph. But it’s not clear which age group each bar is and how many people are in each age group. Besides, the graph ignores people beyond 60 years old.

After that the graph becomes:

brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30', '31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
p2 <- ggplot(data = data, 
       aes(x = Age_Group)) +
  geom_bar(fill="light blue") +
  ylim(0, 300) +
  geom_text(stat = 'count',
           aes(label= paste0(stat(count), ' (', 
                             round(stat(count)/sum(stat(count))*100, 
                             1), '%)')), vjust= -0.5, size= 2.5) +
  gghighlight(Age_Group != "<=20" & Age_Group != ">60")+
  labs(y= 'No. of\nResidents', x= 'Age Group',
       title = "Fig 1. Distribution of Residents' Age",
       subtitle = "Most of residents are in working age(20-60)") +
  theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
        panel.background= element_blank(), axis.line= element_line(color= 'grey'))
p2

1.2 Before vs after the makeover

# The original plot
p1 <- ggplot(data=data) + aes(x=age) +
  geom_histogram(bins=20, boundary=60, color="black", fill="grey") +
  ggtitle("Distribution of Participants' Age")

# The plot after makeover
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30', '31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
p2 <- ggplot(data = data, 
       aes(x = Age_Group)) +
  geom_bar(fill="light blue") +
  ylim(0, 300) +
  geom_text(stat = 'count',
           aes(label= paste0(stat(count), ' (', 
                             round(stat(count)/sum(stat(count))*100, 
                             1), '%)')), vjust= -0.5, size= 2.5) +
  gghighlight(Age_Group != "<=20" & Age_Group != ">60")+
  labs(y= 'No. of\nResidents', x= 'Age Group',
       title = "Fig 1. Distribution of Residents' Age",
       subtitle = "Most of residents are in working age(20-60)") +
  theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
        panel.background= element_blank(), axis.line= element_line(color= 'grey'))
# group them together using patchwork
f1<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f1

Fig 2 Distribution of age groups for people with and without kids

p1 <- ggplot(data=data, aes(x=age, fill=haveKids)) +
  geom_histogram(bins=20, color='gray30')
p1

2.1 Critics

2.1.1 Athetic:

This graph is beautiful for its color and clear legend. I have no suggestion on that.

2.1.2 Clarity:

But it’s not easy to get information in this graph. I assume that Yu Di wants to compare the portion of people with kids in different age groups. So we can make it more informative by using scatter plot whose y is the portion of people who have kids and its x is age groups.

brks <- c(17, 20, 25, 30, 35, 40, 45, 50, 55, 60, Inf)
grps <- c('<=20', '21-25','26-30','31-35', '36-40', '41-45', '46-50', '51-55','56-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
new_data <- data %>%
  count(Age_Group, haveKids) %>%
  group_by(Age_Group) %>%          # now required with changes to dplyr::count()
  mutate(prop = prop.table(n)) %>% 
  filter(haveKids == TRUE)
p2<-ggplot(data=new_data,
           aes(x = Age_Group, 
               y=prop, 
               size=prop, 
               color=prop)) +
  geom_point()+
  ylim(0, 0.45) +
  labs(y= 'Ratio\n of residences\n having kids', x= 'Age Group',
       title = "Fig 2. Does of ratio of having kids differ in different age groups?",
       subtitle = "The proportion of residences having kids is smaller in elder age groups.\n The ratio of having kids in each age group is around 0.2-0.4") +
  geom_text(aes(label= paste0(Age_Group, '(', round(prop,2)*100,'%)')), 
            vjust= -1.5, 
            size= 2.5,
            angle=15) +
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'))+
  guides(size='none',color="none")
p2

2.2 Before vs after makeover

p1 <- ggplot(data=data, aes(x=age, fill=haveKids)) +
  geom_histogram(bins=20, color='gray30')
#+annotate("text", x = 20, y = 75, label = "Before",size=3,color='red')

new_data <- data %>%
  count(Age_Group, haveKids) %>%
  group_by(Age_Group) %>%          # now required with changes to dplyr::count()
  mutate(prop = prop.table(n)) %>% 
  filter(haveKids == TRUE)
p2<-ggplot(data=new_data,
           aes(x = Age_Group, 
               y=prop, 
               size=prop, 
               color=prop)) +
  geom_point()+
  ylim(0, 0.45) +
  labs(y= 'Ratio\n of ppl \n having kids', x= 'Age Group',
       title = "Fig 2. Does of ratio of having kids \n differ in different age groups?",
       subtitle = "The proportion of people having kids is smaller in elder age groups.\n The ratio of having kids in each age group is around 0.2-0.4") +
  geom_text(aes(label= paste0(Age_Group, '(', round(prop,2)*100,'%)')), 
            vjust= -3, 
            size= 1.5,
            angle=30) +
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'))+
  guides(size='none',color="none")
f2<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f2

Fig 3 The age distribution with different education level

3.1 Critics

p1 <- ggplot(data=data, aes(x=age, fill=educationLevel)) +
  geom_histogram(bins=20, color='gray30')
p1

3.1.1 Athetic:

This graph is beautiful for its color and clear legend.

3.1.2 Clarity:

But it’s not easy to get information in this graph. We can make it more informative by using scatter plot whose x is different age groups and its y is ratio of each education level.

brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30','31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
new_data <- data %>%
  count(Age_Group, educationLevel) %>%
  group_by(Age_Group) %>%          # now required with changes to dplyr::count()
  mutate(prop = prop.table(n))
new_data
# A tibble: 24 × 4
# Groups:   Age_Group [6]
   Age_Group educationLevel          n   prop
   <fct>     <ord>               <int>  <dbl>
 1 <=20      Low                     4 0.0833
 2 <=20      HighSchoolOrCollege    27 0.562 
 3 <=20      Bachelors               9 0.188 
 4 <=20      Graduate                8 0.167 
 5 21-30     Low                    21 0.0933
 6 21-30     HighSchoolOrCollege   108 0.48  
 7 21-30     Bachelors              61 0.271 
 8 21-30     Graduate               35 0.156 
 9 31-40     Low                    20 0.0820
10 31-40     HighSchoolOrCollege   129 0.529 
# … with 14 more rows
p2<-ggplot(data=new_data,
           aes(x = Age_Group, 
               y=prop, 
               size=prop, 
               color=educationLevel)) +
  geom_point()+
  facet_wrap(~ educationLevel) + 
  ylim(0, 0.9) +
  labs(y= 'Ratio', x= 'Age Group',
       title = "Fig 3. Does the ratio of education Level differ in different age groups?",
       subtitle = "Different age group has similar education backgroud") +
  geom_text(aes(label= paste0(Age_Group, '(', round(prop,2)*100,'%)')), 
            vjust= -2, 
            size= 2,
            angle=15) +
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'),
        legend.position="top")+
  guides(size="none",color="none")
p2

3.2 Before vs after makeover

Before

After

Fig 4 Distribution of age with different education level

d <- data
d_bg <- d[, -5]
p1 <- ggplot(d, aes(x = age, fill = educationLevel)) + 
  geom_histogram(data=d_bg, fill="grey", alpha=.5) +
  geom_histogram(colour="black") + 
  facet_wrap(~ educationLevel) + 
  guides(fill = "none") + 
  theme_bw()
p1

4.1 Critics

4.1.1 Athetic:

This graph is beautiful for its color and clear legend.

4.1.2 Clarity:

But it’s not easy to get information in this graph. We can make it more informative by using scatter plot whose x is different age groups and its y is ratio of each education level.

After make over:

brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30', '31-40', '41-50', '51-60', '>60')
data$Age_Group <- cut(data$age, breaks=brks, labels = grps, right = FALSE)
p2 <- ggplot(data = data, 
       aes(x = Age_Group, fill=educationLevel)) +
  geom_bar() +
  ylim(0, 200) +
  geom_text(stat = 'count',
           aes(label= paste0(stat(count), ' (', 
                             round(stat(count)/sum(stat(count))*100, 
                             1), '%)')), vjust= -1, size= 1.5, angle=0) +
  gghighlight(Age_Group != "<=20" & Age_Group != ">60")+
  labs(y= 'No. of\nResidents', x= 'Age Group',
       title = "Fig 4. Distribution of Residents' Age",
       subtitle = "The distribution of age for people with different education level is similar") +
  facet_wrap(~ educationLevel) + 
  theme(axis.title.y= element_text(angle=0), axis.ticks.x= element_blank(),
        axis.text.x.bottom = element_text(size=5),
        panel.background= element_blank(), axis.line= element_line(color= 'grey')) +
  theme_bw()
p2

4.2 Before vs after makeover

Before

After

Fig 5 Distribution of joviality with vs without kids

5.1 Critics

p1<-ggplot(data=data, aes(x = joviality, colour=haveKids)) + geom_density()
p1

5.1.1 Athetic:

This graph is not very beautiful but it’s clear.

5.1.2 Clarity:

It lacks title.

new_data <- data %>%
  group_by(haveKids, householdSize)%>% 
summarise(
  joviality_mean = mean(joviality),
)
p2<-ggplot(data=new_data,
           aes(x = householdSize, 
               y=joviality_mean, 
               size=joviality_mean, 
               color=haveKids)) +
  geom_point()+
  ylim(0, 1) +
  labs(y= 'Hapiness', x= 'Household size',
       title = "Fig 5. Does happiness differ for people have vs not have kids?",
       subtitle = "People with kids is slightly happier than those who do not.\n But for people with a house hold of 2 and 3, \n there's no difference in joviality whether they have kids.") +
  geom_text(aes(label= paste0("Household size=",householdSize, '\nHappiness=', round(joviality_mean,2)*100,'%')), 
            vjust= -1, 
            size= 4,
            angle=0) +
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'),
        legend.position="top")+
  guides(size="none")
p2

5.2 Before vs after

Before

After

Fig 6 Boxplot of joviality with vs without kids

6.1 Critics

p1<- ggplot(data=data, aes(y = joviality, x= haveKids)) + 
  geom_violin(fill='light blue') + 
  geom_boxplot(notch=TRUE) +
  stat_summary(geom = "point", fun="mean", colour ="red", size=4)
p1

6.1.1 Athetic:

This graph is beautiful.

6.1.2 Clarity:

It lacks title.

p2<- ggplot(data=data, aes(y = joviality, x= haveKids)) + 
  geom_violin(fill='light blue') + 
  geom_boxplot(notch=TRUE) +
  stat_summary(geom = "point", fun="mean", colour ="red", size=4)+
  labs(y= 'Joviality', x= 'Have Kids',
       title = "Fig 6. Does having kids change happiness?",
       subtitle = "Yes, people who have kids is happier") +
  geom_hline(aes(yintercept = 0.5),
             linetype= 'dashed',
             color= '#f08080',
             size= .6)+
  gghighlight(haveKids == TRUE)+
  theme_bw()
p2

6.2 Before vs after

f6<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f6

Fig 7 Distribution of joviality with different education level

p1<-ggplot(data=data, aes(x= joviality)) + geom_histogram(bins=20) + 
  facet_wrap(~ educationLevel)
p1

7.1 Critics

7.1.1 Athetic:

7.1.2 Clarity:

It lacks title.

new_data <- data %>%
  group_by(educationLevel)%>% 
summarise(
  joviality_mean = mean(joviality),
)
p2<-ggplot(data=new_data,
           aes(x = educationLevel, 
               y=joviality_mean, 
               size=joviality_mean, 
               color=educationLevel)) +
  geom_point()+
  ylim(0, 1) +
  labs(y= 'Hapiness', x= 'Education Level',
       title = "Fig 7. Does happiness differ with different education levels?",
       subtitle = "People is happier with higher education level") +
  geom_text(aes(label= paste0(round(joviality_mean,2)*100,'%')), 
            vjust= -1, 
            size= 4,
            angle=0) +
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'),
        legend.position="top")+
  guides(size="none",color="none")
p2

7.2 Before vs after

f7<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f7

Fig 8 Boxplot of joviality within different interest groups

p1<-ggplot(data=data, aes(y = joviality, x= interestGroup)) + geom_boxplot() +
  facet_grid(educationLevel ~.)

8.1 Critics

8.1.1 Athetic:

8.1.2 Clarity:

new_data <- data %>%
  group_by(educationLevel,interestGroup)%>% 
summarise(
  joviality_mean = mean(joviality),
)
p2<-ggplot(data=new_data,
           aes(x = interestGroup, 
               y=joviality_mean, 
               size=joviality_mean, 
               color=educationLevel)) +
  geom_point()+
  facet_grid(educationLevel~.)+
  ylim(0, 1) +
  labs(y= 'Hapiness', x= 'Interest Group',
       title = "Fig 8. Does happiness differ with different education levels and interest groups?",
       subtitle = "Generally, people is happier with higher education level.\n(H: higher education happier, E,H: no diff, A: higher education less happy)") +
  geom_text(aes(label= paste0(round(joviality_mean,2)*100,'%')), 
            vjust= -1, 
            size= 4,
            angle=0) +
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'),
        legend.position="top")+
  guides(size="none",color="none")
p2

8.2 Before vs after

Before

After

Fig 9 Scatterplot of joviality for different ages

dpp <- data %>%
  group_by(age) %>%
  summarise(joviality = mean(joviality))
p1 <- ggplot(data=dpp, aes(x=age, y=joviality)) + geom_point() +
  coord_cartesian(xlim=c(20, 60), ylim=c(0, 1)) + 
  geom_hline(yintercept=0.5, linetype="dashed", color="grey60", size=1) +  
  geom_vline(xintercept=40, linetype="dashed", color="grey60", size=1)
p1

9.1 Critics

9.1.1 Athetic:

9.1.2 Clarity:

dpp <- data %>%
  group_by(Age_Group) %>%
  summarise(joviality = mean(joviality))
p2 <- ggplot(data=dpp, aes(x=Age_Group, y=joviality, color=joviality)) + geom_point() +
  geom_hline(yintercept=0.5, linetype="dashed", color="grey60", size=1) +  
  guides(size='none')+
  ylim(0.3, 0.65) +
  labs(y= 'Hapiness', x= 'Age Group',
       title = "Fig 9. Does happiness differ with age?",
       subtitle = "People become less happy as growing old") +
  geom_text(aes(label= paste0(round(joviality,2)*100,'%')), 
            vjust= -0.5, 
            size= 3,
            angle=0) +
  gghighlight(joviality > 0.5)+
  theme(axis.title.y= element_text(angle=90), 
        axis.ticks.x= element_blank(),
        panel.background= element_blank(), 
        axis.line= element_line(color= 'grey'),
        legend.position="top")
p2

9.2 Before vs after

f9<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f9

Fig 10 Barplot of householdSize

10.1 Critics

p1<-ggplot(data=data, aes(x=householdSize)) + geom_bar() + coord_flip()

10.1.1 Athetic:

10.1.2 Clarity:

p2<-ggplot(data=data, aes(x=householdSize)) + 
  geom_bar(fill="light blue") +
  geom_text(stat = 'count',
           aes(label= paste0(stat(count), ' (', 
                             round(stat(count)/sum(stat(count))*100, 
                             1), '%)')), vjust= -1, size= 1.5, angle=0) +
  ylim(0, 400) +
  labs(y= 'No. of\nResidents', x= 'Household Size',
       title = "Fig 4. Distribution of Household Size",
       subtitle = "") +
  theme(axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
        axis.text.x.bottom = element_text(size=10),
        panel.background= element_blank(), axis.line= element_line(color= 'grey')) +
  theme_bw()
p2

10.2 Before vs. after

f10<- p1/p2 + plot_annotation(tag_levels = list(c('Before', 'After'), '1'))
f10

Overview of all the graphs